##Stewart and Zhukov 09
##Data Generation
##February 3rd, 2009

set.seed(12345)
library(randomForest)
library(kknn)
library(svmpath)
library(adabag)
library(car)


##Ensemble

## Load the three comma-separated inputs (each is read with a header row).
data <- read.csv('TFIDFLSA.csv') ## features for random forest, KNN and AdaBoost
training <- read.csv('TrainingData.csv')[, 2:4] ## only columns 2-4 are used
data1 <- read.csv('MUTINFLSA500.csv') ## features for the SVM


## Verification (hold-out) sample: v distinct row positions drawn from 1..300,
## stored in ascending order. Every classifier section indexes with `sample`.
## NOTE(review): 300 is presumably the number of labeled rows -- confirm.
v <- 25
wholelist <- NULL ## placeholder; the ensemble section assigns the real value
sample <- sort(sample.int(300, size = v, replace = FALSE))

###############################
#########RANDOM FOREST#########
###############################


##Build the labeled data set and fit the random forest

	##Keep only rows flagged as part of the training set
	x <- cbind(training, data)
	x <- x[x$TRAININGSET_SELECTION %in% 1, ]
	x$TRAININGSET_VALUE <- as.factor(x$TRAININGSET_VALUE)

	##Hold the verification rows out of the fit
	trainingsubset <- x[-sample, ]

	##Random forest inputs
	##Columns 1-3 come from `training`; column 4 is the first column of `data`
	##(presumably the document label -- confirm)
	x <- trainingsubset[, -(1:4)] ##Matrix of Predictors
	y <- trainingsubset[[2]] ## Response Vector: 2 for Use of Force, 3 for Issue

	##Fit on the retained rows, then score every row of `data`
	results <- randomForest(x = x, y = y)
	RFpredictions <- predict(results, newdata = data, type = "response")

###############################
######K-NEAREST NEIGHBOR#######
###############################

##Build the labeled data set and run the k-nearest-neighbour classifier

	##Keep only rows flagged as part of the training set
	x <- as.data.frame(cbind(training, data)) ##Needs to be a dataframe
	x <- x[x$TRAININGSET_SELECTION %in% 1, ]
	##Needs String DVs: 0 becomes 'C', 1 becomes 'A', anything else 'N'
	x$TRAININGSET_VALUE <- as.factor(recode(x$TRAININGSET_VALUE, "0='C'; 1='A'; else='N'"))

	##Separate Test and Training Data
	testsubset <- x[sample, ]
	trainingsubset <- x[-sample, ]

	##Define KNN Inputs
	##Training frame: response (column 2) plus predictors
	x <- trainingsubset[, -c(1, 3, 4)]
	##Scoring frame: every row of `data`, with a response column whose values
	##are replaced by the strings 'Y' (was NA) or 'Z' (anything else)
	TRAININGSET_VALUE <- training$TRAININGSET_VALUE
	xtest <- cbind(TRAININGSET_VALUE, data)
	xtest$TRAININGSET_VALUE <- recode(xtest$TRAININGSET_VALUE, "NA='Y'; else='Z'")

	##Unweighted (rectangular kernel) vote among the 5 nearest neighbours
	results <- kknn(TRAININGSET_VALUE ~ ., train = x, test = xtest, k = 5, kernel = "rectangular")
	##Translate the string classes back: 'C' to 0, 'A' to 1, anything else 99
	KNNpredictions <- recode(predict(results, data), "'C'=0; 'A'=1; else=99")

###################
###ADABOOST########
###################

	##Labeled rows only, reduced to the response (column 2) plus predictors
	x <- cbind(training, data)
	x <- x[x$TRAININGSET_SELECTION %in% 1, ]
	x$TRAININGSET_VALUE <- as.factor(x$TRAININGSET_VALUE)
	x <- x[, -c(1, 3, 4)]

	##Fit on the rows outside the verification sample (mfinal = 100 iterations)
	results <- adaboost.M1(
		TRAININGSET_VALUE ~ ., data = x[-sample, ],
		boos = TRUE, mfinal = 100, coeflearn = 'Breiman',
		minsplit = 5, cp = 0.01, maxdepth = 18
	)

	##Scoring frame: response column plus `data` without its first column
	TRAININGSET_VALUE <- training$TRAININGSET_VALUE
	xtest <- cbind(TRAININGSET_VALUE, data[, -1])
	xtest$TRAININGSET_VALUE <- as.factor(xtest$TRAININGSET_VALUE)

	##Predicted class for every row, stored as a factor
	ABpredictions <- as.factor(predict.boosting(results, xtest)$class)

###############################
####SUPPORT VECTOR MACHINE#####
###############################


##Import Features and Training Information

	##Labeled rows only, paired with the features in data1
	x <- cbind(training, data1)
	x <- x[which(x$TRAININGSET_SELECTION == 1), ]
	y <- as.numeric(x$TRAININGSET_VALUE)

	##Separate Test and Training Data
	trainingsubset <- x[-sample, ]
	testsubset <- x[sample, ]

##Define SVM Inputs
	##Drop the three training columns and the first column of data1
	x <- as.matrix(trainingsubset[, -c(1, 2, 3, 4)]) ##Matrix of Predictors
	ytest <- y[sample]
	y <- y[-sample]

	##One-vs-rest targets: +1 for the class of interest, -1 for everything else
	NAvalue <- recode(y, "99=1; else=-1") ##Code for NA Distinction
	Avalue <- recode(y, "1=1; else=-1") ##Code for A Distinction
	Cvalue <- recode(y, "0=1; else=-1") ##Code for C Distinction

	ytest <- recode(ytest, "99=1; 1=2; 0=3") ##These values match to the order of categories produced below, marked REF1

	##Every row of data1 is scored, minus its first column
	xtest <- as.matrix(data1[, -1])

	##One model per class; each is evaluated at lambda = 1.
	##NOTE(review): `epsilon` and `lambda` do not look like svmpath() argument
	##names (its tolerance is presumably `eps`, and `lambda` presumably
	##partial-matches `lambda.min`) -- confirm against the svmpath manual
	##before relying on these settings. They are left exactly as they were.
	results <- svmpath(x, NAvalue, epsilon = 1e-6, lambda = 0.1)
	predNA <- predict(results, xtest, lambda = 1)
	results <- svmpath(x, Avalue, epsilon = 1e-6, lambda = 0.1)
	predA <- predict(results, xtest, lambda = 1)
	results <- svmpath(x, Cvalue, epsilon = 1e-6, lambda = 0.1)
	predC <- predict(results, xtest, lambda = 1)

	##REF1, determines recoding of ytest above: column order is NA, A, C, so the
	##winning column index is 1 = NA, 2 = A, 3 = C.
	##seq_len() keeps this safe for zero rows, and vapply() builds the result in
	##one pass with a checked type instead of growing a vector inside a loop.
	rows <- seq_len(nrow(xtest))
	scores <- cbind(predNA[rows], predA[rows], predC[rows])
	SVMpredictions <- vapply(rows, function(k) which.max(scores[k, ]), integer(1))

	##Back to the original class labels: NA = 99, A = 1, C = 0
	SVMpredictions <- recode(SVMpredictions, "1=99; 2=1; 3=0")
	SVMpredictions <- as.factor(SVMpredictions)


################################
###ENSEMBLE LIST PRINT OUT######
################################

## Weighted vote across the four classifiers.
##
## Class codes used below: 1 = label 0, 2 = label 1, 3 = label 99.
## cbind() would otherwise store each factor's internal codes, which only match
## this scheme when a classifier happened to predict all three labels. The codes
## are therefore derived from the label text itself.
class_labels <- c("0", "1", "99")

## Map one classifier's predictions to codes 1..3; fail loudly on anything else.
to_class_code <- function(pred, source_name) {
	code <- match(as.character(pred), class_labels)
	if (anyNA(code)) {
		stop("Unexpected or missing class label in ", source_name, call. = FALSE)
	}
	code
}

## Column 1 holds the document labels; columns 2-5 hold the coded predictions.
totlist <- cbind(
	data$labels,
	RFpredictions = to_class_code(RFpredictions, "RFpredictions"),
	KNNpredictions = to_class_code(KNNpredictions, "KNNpredictions"),
	ABpredictions = to_class_code(ABpredictions, "ABpredictions"),
	SVMpredictions = to_class_code(SVMpredictions, "SVMpredictions")
)

## Vote weights: log-odds of (1 - value) for each classifier.
## NOTE(review): the constants are presumably estimated error rates from an
## earlier verification run -- confirm their source.
p <- 1 - 0.3435694
p1 <- log(p / (1 - p)) ## weight for column 2 (random forest)
p <- 1 - .37808
p2 <- log(p / (1 - p)) ## weight for column 3 (KNN)
p <- 1 - .3520542
p3 <- log(p / (1 - p)) ## weight for column 4 (AdaBoost)
p <- 1 - .48084
p4 <- log(p / (1 - p)) ## weight for column 5 (SVM)

wholelist <- totlist
weight_total <- p1 + p2 + p3 + p4

## Share of the total weight cast for one class code, for every row at once.
## The terms are added in the same order as before so results are unchanged.
vote_share <- function(class_code) {
	((wholelist[, 2] == class_code) * p1 + (wholelist[, 3] == class_code) * p2 +
		(wholelist[, 4] == class_code) * p3 + (wholelist[, 5] == class_code) * p4) / weight_total
}

wp1 <- vote_share(1)
wp2 <- vote_share(2)
wp3 <- vote_share(3)

## Winning class per row; which.max() keeps the first class on ties.
wclass <- vapply(
	seq_len(nrow(wholelist)),
	function(i) which.max(c(wp1[i], wp2[i], wp3[i])),
	integer(1)
)

classmatrix <- cbind(totlist, wp1, wp2, wp3, wclass)
write.csv(classmatrix, file = "ProbabilityMatrix.csv", quote = FALSE, row.names = FALSE)

##Single Classification
## One row per document: its label and the winning class from the ensemble.
output <- as.data.frame(data$labels)
output$singleclass <- wclass
#write.csv(output, file = "SingleClass.csv", quote = FALSE, row.names = FALSE)

##Probabilistic Classification
## For every document, draw a class 10000 times with probabilities equal to
## its weighted vote shares (wp1, wp2, wp3).
set.seed(12345)
n_draws <- 10000
n_docs <- nrow(output)
class_codes <- c(1, 2, 3)

## Allocated as double up front so the first assignment does not convert it.
mat <- matrix(data = NA_real_, nrow = n_docs, ncol = n_draws)

## The draw order (all documents for draw 1, then draw 2, ...) is kept so the
## seeded results stay the same. seq_len() makes the loops safe for zero rows.
## base::sample is named explicitly because this script also stores its
## hold-out row positions in a variable called `sample`.
for (q in seq_len(n_draws)) {
	for (i in seq_len(n_docs)) {
		mat[i, q] <- base::sample(x = class_codes, size = 1, prob = c(wp1[i], wp2[i], wp3[i]))
	}
}

## Label plus the full matrix of draws (stored as a single matrix column).
multioutput <- as.data.frame(data$labels)
multioutput$multiclass <- mat
#write.csv(multioutput, file = "MultiClass.csv", quote = FALSE, row.names = FALSE)
